{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn import preprocessing\n",
    "import numpy as np\n",
    "\"\"\"\n",
    "original_data_path = \"../original_data/bank-additional.csv\"\n",
    "processed_data_path = \"../processed_data/\"\n",
    "\"\"\"\n",
    "original_data_path = \"../original_data/bank-additional-full.csv\"\n",
    "processed_data_path = \"../processed_data/\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "bank_data_small  = pd.read_csv(original_data_path,sep=\";\") \n",
    "#print(bank_data_small_train.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "按照数据类型对特征进行简单分类（数值型特征，字符型特征）\n",
    "其中字符型特征又分为：二分类特征，有序特征和无序特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "string_features = bank_data_small.columns[bank_data_small.dtypes  == \"object\"].to_series().values\n",
    "int_features = bank_data_small.columns[bank_data_small.dtypes  == \"int64\"].to_series().values\n",
    "float_features = bank_data_small.columns[bank_data_small.dtypes  == \"float64\"].to_series().values\n",
    "numeric_features = np.append(int_features,float_features)\n",
    "\n",
    "bin_features = ['default', 'housing', 'loan','y']\n",
    "order_features = ['education']\n",
    "disorder_features = ['poutcome', 'job', 'marital', 'contact', 'month','day_of_week']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "三种不同的缺失值填充方法：平均值填充，k近邻填充和随机森林填充"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#使用均值填补缺失值\n",
    "def Missing_value_perprocessing_mean (bank_data_small_train,bank_data_small_test):\n",
    "    col  = bank_data_small_train.columns\n",
    "    #Train_copy = Train.copy()\n",
    "    #直接使用平均值填补缺失值\n",
    "    from sklearn.preprocessing import Imputer\n",
    "    imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)\n",
    "    imp.fit(bank_data_small_train)\n",
    "    bank_data_small_train = imp.transform(bank_data_small_train) \n",
    "    bank_data_small_test = imp.transform(bank_data_small_test) \n",
    "    bank_data_small_train = pd.DataFrame(bank_data_small_train,columns = col)\n",
    "    bank_data_small_test = pd.DataFrame(bank_data_small_test,columns = col)\n",
    "    return bank_data_small_train,bank_data_small_test \n",
    "\n",
    "#使用随机森林填补缺失值\n",
    "def Missing_value_perprocessing_rf (bank_data_small_train,bank_data_small_test):\n",
    "    Missing_features_dict = {}\n",
    "    Missing_features_name = []\n",
    "    #先统计哪些列存在缺失的数据\n",
    "    for feature in bank_data_small_train.columns:\n",
    "        Missing_count = bank_data_small_train[bank_data_small_train[feature].isnull()]['age'].count() \n",
    "        if Missing_count > 0:\n",
    "            # 统计包含缺失值的列\n",
    "            Missing_features_dict.update({feature: Missing_count})\n",
    "    #对缺失的数据列按照缺失值数量从少到多排序，先拟合缺失值少的列        \n",
    "    Missing_features_name = sorted(Missing_features_dict.keys(),reverse=True) \n",
    "    #print(Missing_features_name)\n",
    "    for feature in Missing_features_name:     \n",
    "        #训练集中有缺失值的数据\n",
    "        train_miss_data = bank_data_small_train[bank_data_small_train[feature].isnull()]\n",
    "        train_miss_data_X = train_miss_data.drop(Missing_features_name, axis=1)\n",
    "        #训练集中没有缺失值的数据\n",
    "        train_full_data = bank_data_small_train[bank_data_small_train[feature].notnull()]     \n",
    "        train_full_data_Y = train_full_data[feature]\n",
    "        train_full_data_X = train_full_data.drop(Missing_features_name, axis=1)\n",
    "        #测试集中有缺失值的数据\n",
    "        test_miss_data = bank_data_small_test[bank_data_small_test[feature].isnull()]\n",
    "        test_miss_data_X = test_miss_data.drop(Missing_features_name, axis=1)\n",
    "        #测试集中没有缺失值的数据\n",
    "        test_full_data = bank_data_small_test[bank_data_small_test[feature].notnull()]     \n",
    "        test_full_data_Y = test_full_data[feature]\n",
    "        test_full_data_X = test_full_data.drop(Missing_features_name, axis=1)\n",
    "        from sklearn.ensemble import RandomForestClassifier\n",
    "        #使用随机森林拟合        \n",
    "        rf = RandomForestClassifier(n_estimators=100)\n",
    "        #利用训练集中没有缺失值的数据构建随机森林\n",
    "        rf.fit(train_full_data_X, train_full_data_Y)\n",
    "        #预测训练集中的缺失值\n",
    "        train_miss_data_Y = rf.predict(train_miss_data_X)\n",
    "        #预测测试集中的缺失值\n",
    "        test_miss_data_Y = rf.predict(test_miss_data_X) \n",
    "        #将训练集中的缺失值补充完整\n",
    "        train_miss_data[feature] = train_miss_data_Y  \n",
    "        #将测试集中的缺失值补充完整\n",
    "        test_miss_data[feature] = test_miss_data_Y \n",
    "        #将补充完整的\n",
    "        bank_data_small_train = pd.concat([train_full_data, train_miss_data])\n",
    "        bank_data_small_test = pd.concat([test_full_data, test_miss_data])      \n",
    "        \n",
    "    return bank_data_small_train,bank_data_small_test\n",
    "\n",
    "#使用knn填补缺失值\n",
    "def Missing_value_perprocessing_knn (bank_data_small_train,bank_data_small_test):\n",
    "    Missing_features_dict = {}\n",
    "    Missing_features_name = []\n",
    "    #先统计哪些列存在缺失的数据\n",
    "    for feature in bank_data_small_train.columns:\n",
    "        Missing_count = bank_data_small_train[bank_data_small_train[feature].isnull()]['age'].count() \n",
    "        if Missing_count > 0:\n",
    "            # 统计包含缺失值的列\n",
    "            Missing_features_dict.update({feature: Missing_count})\n",
    "    #对缺失的数据列按照缺失值数量从少到多排序，先拟合缺失值少的列        \n",
    "    Missing_features_name = sorted(Missing_features_dict.keys(),reverse=True)\n",
    "    from sklearn.neighbors import KNeighborsClassifier \n",
    "    for feature in Missing_features_name:     \n",
    "        #训练集中有缺失值的数据\n",
    "        train_miss_data = bank_data_small_train[bank_data_small_train[feature].isnull()]\n",
    "        train_miss_data_X = train_miss_data.drop(Missing_features_name, axis=1)\n",
    "        #训练集中没有缺失值的数据\n",
    "        train_full_data = bank_data_small_train[bank_data_small_train[feature].notnull()]     \n",
    "        train_full_data_Y = train_full_data[feature]\n",
    "        train_full_data_X = train_full_data.drop(Missing_features_name, axis=1)\n",
    "        #测试集中有缺失值的数据\n",
    "        test_miss_data = bank_data_small_test[bank_data_small_test[feature].isnull()]\n",
    "        test_miss_data_X = test_miss_data.drop(Missing_features_name, axis=1)\n",
    "        #测试集中没有缺失值的数据\n",
    "        test_full_data = bank_data_small_test[bank_data_small_test[feature].notnull()]     \n",
    "        test_full_data_Y = test_full_data[feature]\n",
    "        test_full_data_X = test_full_data.drop(Missing_features_name, axis=1)\n",
    "        \n",
    "        #使用K近邻拟合        \n",
    "        knn = KNeighborsClassifier()\n",
    "        forest = knn.fit(train_full_data_X, train_full_data_Y)\n",
    "        \n",
    "        train_miss_data_Y = knn.predict(train_miss_data_X)\n",
    "        test_miss_data_Y = knn.predict(test_miss_data_X) \n",
    "        \n",
    "        train_miss_data[feature] = train_miss_data_Y      \n",
    "        test_miss_data[feature] = test_miss_data_Y \n",
    "\n",
    "        bank_data_small_train = pd.concat([train_full_data, train_miss_data])\n",
    "        bank_data_small_test = pd.concat([test_full_data, test_miss_data])      \n",
    "        \n",
    "    return bank_data_small_train,bank_data_small_test   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对特征值进行归一化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#归一化\n",
    "def Scale_perprocessing (Train):\n",
    "    col  = Train.columns\n",
    "    copy = Train.copy()\n",
    "    scaler = preprocessing.MinMaxScaler()\n",
    "    #新版本中fit_transform的第一个参数必须为二维矩阵\n",
    "    copy = scaler.fit_transform(copy)\n",
    "    Train = pd.DataFrame(copy,columns = col)\n",
    "    return Train "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "把二分类特征转化成（1，0）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#处理二分类的特征\n",
    "def bin_features_perprocessing (bin_features, bank_data):\n",
    "    for feature in bin_features:      \n",
    "        new = np.zeros(bank_data[feature].shape[0])\n",
    "        for rol in range(bank_data[feature].shape[0]):\n",
    "            if bank_data[feature][rol] == 'yes' :\n",
    "                new[rol] = 1\n",
    "            elif bank_data[feature][rol]  == 'no':\n",
    "                new[rol] = 0\n",
    "            else:\n",
    "                new[rol] = None\n",
    "        bank_data[feature] =  new   \n",
    "    return bank_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "把有序特征转化为序列（1，2，……，n）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#特征值有次序关系的特征，按照特征值强弱排序（如：受教育程度）\n",
    "def order_features_perprocessing (order_features,bank_data):\n",
    "    education_values = [\"illiterate\", \"basic.4y\", \"basic.6y\", \"basic.9y\", \n",
    "    \"high.school\",  \"professional.course\", \"university.degree\",\"unknown\"]\n",
    "    replace_values = list(range(1,  len(education_values)))\n",
    "    replace_values.append(None)\n",
    "    #除了replace也可以用map()\n",
    "    bank_data[order_features] = bank_data[order_features].replace(education_values,replace_values)\n",
    "    bank_data[order_features] = bank_data[order_features].astype(\"float\")\n",
    "    return bank_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对无序特征进行onehot编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#特征值没有次序的特征，一律使用onehot编码\n",
    "def disorder_features_perprocessing (disorder_features, bank_data):\n",
    "    for features in disorder_features:\n",
    "        #做onehot\n",
    "        features_onehot = pd.get_dummies(bank_data[features])\n",
    "        #把名字改成features_values\n",
    "        features_onehot = features_onehot.rename(columns=lambda x: features+'_'+str(x))\n",
    "        #拼接onehot得到的新features\n",
    "        bank_data = pd.concat([bank_data,features_onehot],axis=1)\n",
    "        #删掉原来的feature columns\n",
    "        bank_data = bank_data.drop(features, axis=1)\n",
    "    return bank_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:98: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:99: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    }
   ],
   "source": [
    "#将字符型特征转化为数值型特征\n",
    "#转化二分类特征为1，0\n",
    "bank_data_small = bin_features_perprocessing(bin_features, bank_data_small)\n",
    "#转化包含次序的特征\n",
    "bank_data_small = order_features_perprocessing(order_features, bank_data_small)\n",
    "#转化无序的特征\n",
    "bank_data_small = disorder_features_perprocessing(disorder_features, bank_data_small)\n",
    "\n",
    "#随机重排后，划分训练集和测试集\n",
    "bank_data_small.shape[0]\n",
    "round(bank_data_small.shape[0]*0.8)\n",
    "bank_data_small = bank_data_small.sample(frac=1,random_state=12)\n",
    "import math\n",
    "bank_data_small_train = bank_data_small.iloc[0:round(bank_data_small.shape[0]*0.8),:]\n",
    "bank_data_small_test = bank_data_small.iloc[round(bank_data_small.shape[0]*0.8):,:]\n",
    "\n",
    "#Missing_value_perprocessing\n",
    "#平均值\n",
    "#bank_data_small_train,bank_data_small_test = Missing_value_perprocessing_mean(bank_data_small_train,bank_data_small_test)\n",
    "#k近邻\n",
    "bank_data_small_train,bank_data_small_test = Missing_value_perprocessing_knn(bank_data_small_train,bank_data_small_test)\n",
    "#随机森林\n",
    "#bank_data_small_train,bank_data_small_test = Missing_value_perprocessing_rf(bank_data_small_train,bank_data_small_test) \n",
    "\n",
    "X_train_small = bank_data_small_train.drop(['y'], axis=1).copy()\n",
    "y_train_small = pd.DataFrame(bank_data_small_train['y'],columns = ['y'])\n",
    "\n",
    "X_test_small = bank_data_small_test.drop(['y'], axis=1).copy()\n",
    "y_test_small = pd.DataFrame(bank_data_small_test['y'],columns = ['y'])\n",
    "\n",
    "X_train_small = Scale_perprocessing(X_train_small)\n",
    "X_test_small = Scale_perprocessing(X_test_small)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "X_test_output_path = \"../processed_data/X_test.csv\"\n",
    "y_test_output_path = \"../processed_data/y_test.csv\"\n",
    "X_train_output_path = \"../processed_data/X_train.csv\"\n",
    "y_train_output_path = \"../processed_data/y_train.csv\"\n",
    "\"\"\"\n",
    "X_test_output_path = \"../processed_data/X_test_small.csv\"\n",
    "y_test_output_path = \"../processed_data/y_test_small.csv\"\n",
    "X_train_output_path = \"../processed_data/X_train_small.csv\"\n",
    "y_train_output_path = \"../processed_data/y_train_small.csv\"\n",
    "\"\"\"\n",
    "X_test_small.to_csv(X_test_output_path,index = False)\n",
    "y_test_small.to_csv(y_test_output_path,index = False)\n",
    "X_train_small.to_csv(X_train_output_path,index = False)\n",
    "y_train_small.to_csv(y_train_output_path,index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
